# Want to prioritize words that are genes
import json
from copy import deepcopy
from glob import glob
from io import StringIO

import pandas as pd
import requests

from clustergrammer2 import net
# Markdown note files (only consumed by the commented-out word-count
# exploration at the bottom of this file).
all_files = glob('../markdown_files/*.md')
len(all_files)  # notebook leftover: no effect when run as a script
# Precomputed Altmetric attention scores keyed by DOI (used to annotate columns).
dict_altmetric = net.load_json_to_dict('../altmetric_data/altmetric_scores.json')
# Download the manually-curated grading spreadsheet (published from Google
# Sheets as TSV) into a DataFrame indexed by DOI (first column).
google_sheet_url = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vRngfhDKqZUEhHuQY60n3Bh76gkMQKeOq6D7UYkSgt0KPP7rcCTE-PjMeWO1g1YlGVhBTAMJS6rn-pc/pub?gid=0&single=true&output=tsv'
r = requests.get(google_sheet_url)
# NOTE: the original Python-2 StringIO fallback (sys.version_info guard) was
# removed -- pandas and clustergrammer2 require Python 3.
df = pd.read_csv(StringIO(r.text), sep="\t", index_col=0)
df.index.name = None  # drop the sheet's index-column header
# Fetch the bioRxiv COVID-19 collection (group 181). The payload's 'rels'
# key (read below) holds one record per paper: rel_doi, rel_title, rel_abs,
# rel_date, rel_site.
url = 'https://connect.biorxiv.org/relate/collection_json.php?grp=181'
r = requests.get(url)
req_dict = json.loads(r.text)
# Stop words filtered out of abstract tokens before counting. The only use
# downstream is membership testing ("x not in stop_words"), so this is now a
# set: O(1) lookup instead of the original list's O(n) scan per token, and
# the duplicate 'thus' entry disappears automatically.
stop_words = {
    "i", "me", "my", "myself", "we", "us", "our", "ours", "ourselves",
    "you", "your", "yours", "yourself", "yourselves", "he", "him", "his",
    "himself", "she", "her", "hers", "herself", "it", "its", "itself",
    "they", "them", "their", "theirs", "themselves", "what", "which",
    "who", "whom", "whose", "this", "that", "these", "those", "am", "is",
    "are", "was", "were", "be", "been", "being", "have", "has", "had",
    "having", "do", "does", "did", "doing", "will", "would", "should",
    "can", "could", "ought", "i'm", "you're", "he's", "she's", "it's",
    "we're", "they're", "i've", "you've", "we've", "they've", "i'd",
    "you'd", "he'd", "she'd", "we'd", "they'd", "i'll", "you'll", "he'll",
    "she'll", "we'll", "they'll", "isn't", "aren't", "wasn't", "weren't",
    "hasn't", "haven't", "hadn't", "doesn't", "don't", "didn't", "won't",
    "wouldn't", "shan't", "shouldn't", "can't", "cannot", "couldn't",
    "mustn't", "let's", "that's", "who's", "what's", "here's", "there's",
    "when's", "where's", "why's", "how's", "a", "an", "the", "and", "but",
    "if", "or", "because", "as", "until", "while", "of", "at", "by",
    "for", "with", "about", "against", "between", "into", "through",
    "during", "before", "after", "above", "below", "to", "from", "up",
    "upon", "down", "in", "out", "on", "off", "over", "under", "again",
    "further", "then", "once", "here", "there", "when", "where", "why",
    "how", "all", "any", "both", "each", "few", "more", "most", "other",
    "some", "such", "no", "nor", "not", "only", "own", "same", "so",
    "than", "too", "very", "say", "says", "said", "shall",
    # corpus-specific additions
    "2019", "novel", "patients", "using", "may", "2019-ncov", "2020",
}
stop_words.update(['2020,', 'conclusions', 'characteristics'])
stop_words.update(['=', '1', '2', '3', '4', '5', '6', '7', '8', '9'])
more_stop_words = ['data', 'results', 'study', 'used', 'also', 'analysis', 'two', 'one', 'different', 'however',
                   'early', 'first', 'found', 'new', 'well', 'show', 'three', 'use', 'important', 'method', 'observed',
                   'studies', 'across', 'due', 'likely', 'included', 'suggest', 'many', 'similar', 'around',
                   'several', 'still', 'even', 'basic', 'four', 'much', 'now', 'five', 'six', 'since', 'thus',
                   'especially', 'end', 'considered', 'result', 'find', 'general', 'best', 'thus']
stop_words.update(more_stop_words)
# Tokenize every paper's abstract into a filtered, deduplicated word list.
doi_words = {}    # DOI -> sorted unique abstract words
all_words = []    # all (non-unique) words across papers, for frequency counts
doi_titles = {}   # DOI -> title
doi_site = {}     # DOI -> hosting site (e.g. biorxiv)
arr_papers = req_dict['rels']

# Punctuation stripped from each token in one C-level pass. The original
# also replaced '\n'/'\t', but str.split() already splits on all whitespace,
# so those characters can never appear in a token.
_punct = str.maketrans('', '', ':,.()')
_stop = set(stop_words)  # O(1) membership per token

for inst_paper in arr_papers:
    # get words from abstract: lowercase and strip punctuation
    inst_words = [w.lower().translate(_punct)
                  for w in inst_paper['rel_abs'].split()]
    # explicit drop words
    inst_words = [w for w in inst_words if w not in _stop]
    # keep only all-lowercase-letter tokens; islower() is False for pure
    # digits/symbols, so this drops numbers and leftover punctuation
    inst_words = [w for w in inst_words if w.islower()]
    # save words and metadata keyed by DOI
    inst_doi = inst_paper['rel_doi']
    doi_words[inst_doi] = sorted(set(inst_words))
    doi_titles[inst_doi] = inst_paper['rel_title']
    doi_site[inst_doi] = inst_paper['rel_site']
    all_words.extend(inst_words)
# Seed the per-paper metadata table with titles, indexed by DOI.
ser_titles = pd.Series(doi_titles)
ser_titles.head()  # notebook leftover: no effect when run as a script
df_meta = pd.DataFrame({'Title': ser_titles})
df_meta.shape  # notebook leftover
inst_paper.keys()  # notebook leftover; relies on the loop variable leaking
# Attach a sortable date value and an Altmetric score to every paper.
for inst_paper in arr_papers:
    inst_doi = inst_paper['rel_doi']
    # date: encode 'YYYY-MM-DD' as the float MM.DD (the year is discarded)
    _year, month, day = inst_paper['rel_date'].split('-')
    df_meta.loc[inst_doi, 'date'] = float(month + '.' + day)
    # altmetric score, defaulting to 0 when the DOI has no entry
    try:
        score = dict_altmetric[inst_doi]
    except KeyError:
        print('not found')
        score = 0
    df_meta.loc[inst_doi, 'altmetric'] = score
# Word frequencies across all abstracts.
ser_count = pd.Series(all_words).value_counts()
# Keep mid-frequency words: present in more than 5 abstracts but in fewer
# than 75% of papers. The original chained boolean indexing
# (ser[cond1][cond2]) applied a full-length mask to an already-filtered
# Series, which modern pandas rejects as unalignable; combine the
# conditions with & instead.
max_count = len(arr_papers) * 0.75
ser_count = ser_count[(ser_count > 5) & (ser_count < max_count)]
ser_count.shape  # notebook leftover
ser_count.plot()
# value_counts() sorts descending, so the head is the most frequent words
top_words = ser_count.index.tolist()[:1000]
# Binary occurrence matrix: rows = top words, columns = DOIs; cell is 1
# when the word appears in that paper's abstract.
all_dois = sorted(doi_words)
len(all_dois)  # notebook leftover
df_words = pd.DataFrame(0, index=top_words, columns=all_dois)
top_word_set = set(top_words)
for inst_doi in all_dois:
    present = [w for w in doi_words[inst_doi] if w in top_word_set]
    df_words.loc[present, inst_doi] = 1
cols = df_words.columns.tolist()
# Normalize the human-entered grade strings from the spreadsheet into
# canonical values ('1'/'2'/'3', or 'N.A.' for missing/ungraded papers).
# Hoisted: the original rebuilt df.index.tolist() and scanned it linearly
# on every iteration; a set built once gives O(1) lookups.
graded_dois = set(df.index)
grade_dict = {}
for inst_col in cols:
    if inst_col not in graded_dois:
        grade_dict[inst_col] = 'N.A.'
        continue
    grade = str(df.loc[inst_col, 'Grade'])
    # collapse mixed/uncertain entries; replacements run in the original order
    for old, new in (('2-3', '3'), ('2-1', '2'), ('1-2', '2'),
                     ('1/2', '2'), ('nan', 'N.A.'), ('?', '')):
        grade = grade.replace(old, new)
    grade_dict[inst_col] = grade
# Multi-level column tuples for the clustergram:
# (truncated title, site, grade, date, altmetric).
new_cols = []
for inst_doi in cols:
    new_cols.append((
        df_meta.loc[inst_doi, 'Title'][:50],
        'Site: ' + doi_site[inst_doi],
        'Grade: ' + str(grade_dict[inst_doi]),
        'Date: ' + str(df_meta.loc[inst_doi, 'date']),
        'Altmetric: ' + str(df_meta.loc[inst_doi, 'altmetric']),
    ))
df_cat = deepcopy(df_words)
df_cat.columns = new_cols
# Color assignments for the clustergram's column categories (site + grade).
cat_colors = {
    'biorxiv': 'blue',
    'red': 'red',
    'N.A.': 'white',
    'nan': 'white',
    '1': '#FFD700',  # gold
    '2': '#FF6347',  # tomato
    '3': '#add8e6',  # light blue
}
# Drop papers whose abstracts matched none of the top words, then build,
# cluster, and export the word-by-paper clustergram.
ser_sum = df_cat.sum(axis=0)
keep_cols = ser_sum.index[ser_sum > 0].tolist()
net.load_df(df_cat[keep_cols])
# color the Site and Grade category rows of the column labels
net.set_cat_colors(axis='col', cat_index=1, cat_title='Site', cat_colors=cat_colors)
net.set_cat_colors(axis='col', cat_index=2, cat_title='Grade', cat_colors=cat_colors)
net.filter_N_top(axis='row', rank_type='sum', N_top=500)
net.cluster(dist_type='jaccard')
net.widget()
net.save_dict_to_json(net.viz, '../json_files/heatmap_2020-04-05.json')
# words_list = []
# for inst_file in all_files:
# f = open(inst_file, 'r')
# lines = f.readlines()
# f.close()
# for inst_line in lines:
# inst_line = inst_line.lower()
# inst_words = inst_line.split(' ')
# inst_words = [x for x in inst_words if '*' not in x]
# words_list.extend(inst_words)
# pd.Series(words_list).value_counts().head(50)